Creating a Hex Bin Map with Pell Grant Data

By Brendan Graham in tidy tuesday data viz hex bin map

September 10, 2022

library(tidytuesdayR)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(purrr)
library(skimr)
library(here)
## here() starts at /Users/brendangraham/projects/brendangraham.online
library(ggrepel)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.0
## ✔ dials        1.0.0     ✔ tune         1.0.0
## ✔ infer        1.0.3     ✔ workflows    1.0.0
## ✔ modeldata    1.0.0     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.1     ✔ yardstick    1.0.0
## ✔ recipes      1.0.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ plotly::filter()  masks dplyr::filter(), stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(rules)
## 
## Attaching package: 'rules'
## 
## The following object is masked from 'package:dials':
## 
##     max_rules
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loading required package: iterators
## Loading required package: parallel
library(ghibli)
library(gghighlight)
library(highcharter)
library(ggsci)
library(usdata)

source(here::here("content", "blog", "util.R"))

Get the data

First I read in the data and do some minor cleaning:
* only include data post-1999, since it looks like 1999 isn’t a full year of data.
* overwrite the state_name variable with the full state name instead of the abbreviation. This makes creating a hex bin map later on a little easier. One consequence of this is that Puerto Rico and other US Territories are excluded.

# URL of the 2022-08-30 Tidy Tuesday Pell grant dataset on GitHub
tt_url <- 
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-08-30/pell.csv"

# Pull the raw CSV straight from the repository
pell <- readr::read_csv(tt_url)
## Rows: 100474 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): STATE, NAME, SESSION
## dbl (3): AWARD, RECIPIENT, YEAR
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural overview of the raw data: column types, missingness, ranges
skimr::skim(pell)
Name pell
Number of rows 100474
Number of columns 6
_______________________
Column type frequency:
character 3
numeric 3
________________________
Group variables None

Table 1: Data summary

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
STATE 0 1 2 2 0 59 0
NAME 0 1 3 70 0 10731 0
SESSION 0 1 7 7 0 19 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
AWARD 4 1 3950027.16 12022838.13 0 268390.2 1060252 3603497 1208452160 ▇▁▁▁▁
RECIPIENT 0 1 1249.81 3517.14 0 94.0 370 1246 304583 ▇▁▁▁▁
YEAR 0 1 2008.26 5.35 1999 2004.0 2008 2013 2017 ▇▇▆▇▇
# Re-read the raw file and apply the minor cleaning described above:
#   * clean_names() lowercases the ALL-CAPS column names
#   * keep only years after 1999 (1999 looks like a partial year of data)
#   * add the full state name from the two-letter abbreviation; territories
#     with no match (e.g. Puerto Rico) become NA and drop out of the map later
pell <-
  readr::read_csv(tt_url) %>%
  janitor::clean_names() %>%
  filter(year > 1999) %>%
  mutate(state_name = usdata::abbr2state(abbr = state))
## Rows: 100474 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): STATE, NAME, SESSION
## dbl (3): AWARD, RECIPIENT, YEAR
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Render the cleaned data as a formatted table (get_table() comes from util.R)
pell %>% 
  get_table()

Explore the data

It looks like some time around 2009 the total dollar amount of Pell grants increased and remained at a higher level than prior to 2009. The volume of grants remained constant though, so we see the impact of this increase in the average amount per grant.

# Three stacked views of the yearly trend: grant volume, total dollars, and
# mean dollars per grant. bg_theme() and the bg_* colours come from util.R.
# Fix: na.rm = T replaced with na.rm = TRUE (T is a reassignable variable).
ggpubr::ggarrange(
  
  # Grant volume per year
  pell %>% 
    group_by(year) %>%
    tally() %>% 
    ggplot(., aes(x = year, y = n)) +
    geom_col(fill = bg_red) +
    bg_theme(base_size = 12, plot_title_size = 12) + 
    labs(x = "Year", y = "Count", title = "Total Pell Grant Volume per Year"),
  
  # Total dollars awarded per year
  pell %>% 
    group_by(year) %>%
    summarise(total_amt = sum(award, na.rm = TRUE)) %>% 
    ggplot(., aes(x = year, y = total_amt)) +
    geom_col(fill = bg_green) +
    bg_theme(base_size = 12, plot_title_size = 12) + 
    scale_y_continuous(labels = dollar) + 
    labs(x = "Year", y = "", title = "Total Pell Grant Dollar Amount per Year"),
  
  # Mean dollars per grant per year
  pell %>%
    group_by(year) %>%
    summarise(mean_amt = mean(award, na.rm = TRUE)) %>% 
    ggplot(., aes(x = year, y = mean_amt)) +
    geom_col(fill = bg_blue) +
    bg_theme(base_size = 12, plot_title_size = 12) + 
    scale_y_continuous(labels = dollar) + 
    labs(x = "Year", y = "", title = "Mean Pell Grant Dollar Amount per Year"), 
  nrow = 3,
  ncol = 1
)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

Looking state by state, we can see that not all states behave the same. Some states spike post-2009 and come down to a new baseline level, like AZ, while some increase and remain steady at a new level, like CA.

# Per-state mean grant amount over time, highlighting AZ (spike-and-settle)
# and CA (step up to a new plateau) via gghighlight.
# Fix: T replaced with TRUE for na.rm and use_direct_label (T is reassignable).
pell %>% 
  group_by(state, year) %>%
  summarise(mean_amt = mean(award, na.rm = TRUE)) %>% 
  ungroup() %>%
  ggplot(., aes(x = year, y = mean_amt, color = state)) +
  geom_line() + 
  geom_point() + 
  scale_y_continuous(labels = dollar) + 
  gghighlight(label_key = state, use_direct_label = TRUE, state %in% c("CA", "AZ"), use_group_by = FALSE) + 
  labs(x = "Year", y = "Amount ($)", title = "Mean Pell Grant Dollar Amount per Year") + 
  scale_color_npg() + 
  bg_theme()
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.

Digging into this post 2009 increase some more, there is some evidence that the policy changes as a result of the 2008 financial crisis can explain some of what we’re seeing in the data.

Specifically, that document notes that

the recession of 2007–2009 and the subsequent slow recovery drew more students into the recipient pool. Eligibility increased as adult students and the families of dependent students experienced losses in income and assets; enrollment of eligible students also rose as people who had lost jobs sought to acquire new skills and people who would have entered the workforce enrolled in school because they could not find employment. The expansion of online education, particularly at for-profit institutions, attracted still more students, many of whom were eligible for Pell grant.

Visualizing the Change

This plot ranks states in order of change post-2009, with Arizona leading the way.

# https://www.cbo.gov/sites/default/files/cbofiles/attachments/44448_PellGrants_9-5-13.pdf

# Mean yearly grant count per state, split into pre-/post-2009 windows.
# NOTE(review): the result is still grouped and keeps one row per year, so
# mean_grants is repeated across years; the distinct() after the join in the
# next chunk is what collapses those duplicates — confirm that is intended.
time_prep <- 
  pell %>%
  mutate(timeframe = ifelse(year < 2009, "pre_2009", "post_2009")) %>%
  group_by(year, state_name, timeframe) %>%
  summarise(total = n()) %>%
  ungroup() %>%
  group_by(state_name, timeframe) %>%
  mutate(mean_grants = mean(total)) %>%
  select(state_name, timeframe, mean_grants)
## `summarise()` has grouped output by 'year', 'state_name'. You can override
## using the `.groups` argument.
# Award-amount summaries by state and pre/post-2009 window, with the mean
# yearly grant counts from time_prep joined on. time_prep carries one row per
# year, so distinct() collapses the duplicated summary rows after the join.
# Fix: na.rm = T replaced with na.rm = TRUE (T is a reassignable variable).
pell_time_state_comparison <- 
  pell %>%
  mutate(timeframe = ifelse(year < 2009, "pre_2009", "post_2009")) %>%
  group_by(state_name, timeframe) %>%
  summarise(total_amt = sum(award, na.rm = TRUE),
            mean_amt = mean(award, na.rm = TRUE),
            sd_amt = sd(award, na.rm = TRUE)) %>%
  left_join(., time_prep, by = c("state_name", "timeframe")) %>%
  distinct()
## `summarise()` has grouped output by 'state_name'. You can override using the
## `.groups` argument.
# Ranked bar chart: post-2009 minus pre-2009 change in mean grant amount,
# one bar per state, sorted by the size of the change.
# NOTE(review): the bar labels divide diff by 100000 — if the intent is
# "$ millions" (matching the hex-map bins below) this should be 1e6; confirm.
# NOTE(review): log(diff) produces NaN for any state whose diff is <= 0,
# which would leave those bars unfilled — verify all diffs are positive.
pell_time_state_comparison %>% 
  select(state_name, mean_amt, timeframe) %>%
  pivot_wider(names_from = timeframe, values_from = mean_amt) %>%
  mutate(diff = post_2009 - pre_2009) %>%
  filter(!(is.na(state_name))) %>%
  ggplot(., aes(x = reorder(state_name, diff), y = diff, label = paste0("$", round(diff/100000,2)),  fill = log(diff))) + 
  scale_y_continuous(labels = scales::dollar) + 
  scale_fill_gradient(low = "cornflowerblue", high = bg_green) + 
  geom_col(show.legend = FALSE) + 
  coord_flip() + 
  bg_theme(base_size = 16) +
  geom_text(hjust = 1.05, family = 'RobotoMono-Regular', size = 3, color = 'white') + 
  labs(y = "Amount($)", x = "State", title =" States Ranked by Post-2009 Change in Mean Grant Amount")

We can also visualize these increases using a hex bin map, in which each state is represented by a hexagon, color-coded with the post-2009 increase in average Pell grant money received.

# https://r-graph-gallery.com/328-hexbin-map-of-the-usa.html

# Download the Hexagones boundaries at geojson format here: 
# https://team.carto.com/u/andrew/tables/andrew.us_states_hexgrid/public/map.

library(tidyverse)
library(geojsonio)
## Registered S3 method overwritten by 'geojsonsf':
##   method        from   
##   print.geojson geojson

## 
## Attaching package: 'geojsonio'

## The following object is masked from 'package:base':
## 
##     pretty
library(RColorBrewer)
library(rgdal)
## Loading required package: sp

## Please note that rgdal will be retired by the end of 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## 
## rgdal: version: 1.5-32, (SVN revision 1176)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.4.2, released 2022/03/08
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/sf/gdal
## GDAL binary built with GEOS: FALSE 
## Loaded PROJ runtime: Rel. 8.2.1, January 1st, 2022, [PJ_VERSION: 821]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.5-0
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
# Download the Hexagones boundaries at geojson format here: https://team.carto.com/u/andrew/tables/andrew.us_states_hexgrid/public/map.

# Load this file. (Note: I stored in a folder called DATA)
# Read the hex-grid boundaries as an sp SpatialPolygonsDataFrame
spdf <- 
  geojson_read(here::here("content", "blog", "pell", "us_states_hexgrid.geojson"),
               what = "sp")

# Strip the " (United States)" suffix so google_name matches the state names
# used as the join key for the Pell data
spdf@data$google_name <- gsub(" \\(United States\\)", "", spdf@data$google_name)

# Show framework
# plot(spdf)

library(broom)
# Flatten the sp hex polygons into a long data frame for ggplot, keyed by
# state name (column `id`).
# NOTE(review): broom::tidy() on sp objects is deprecated — consider the
# sf-based workflow (sf::st_as_sf + geom_sf) when updating this post.
spdf_fortified <- 
  tidy(spdf, region = "google_name")

library(rgeos)
## rgeos version: 0.5-9, (SVN revision 684)
##  GEOS runtime version: 3.10.2-CAPI-1.16.0 
##  Please note that rgeos will be retired by the end of 2023,
## plan transition to sf functions using GEOS at your earliest convenience.
##  GEOS using OverlayNG
##  Linking to sp version: 1.4-7 
##  Polygon checking: TRUE

## 
## Attaching package: 'rgeos'

## The following object is masked from 'package:parsnip':
## 
##     translate
# One label point per state: polygon centroid plus its two-letter code.
# Fix: the original wrapped data.frame() in a redundant cbind.data.frame(),
# which is a no-op for a single data frame — removed.
centers <- 
  data.frame(gCentroid(spdf, byid = TRUE), id = spdf@data$iso3166_2)

# Plain background map to sanity-check the hex grid and centroid labels
ggplot() +
  geom_polygon(
    data = spdf_fortified,
    aes(x = long, y = lat, group = group),
    fill = "skyblue",
    color = "white"
  ) +
  geom_text(data = centers, aes(x = x, y = y, label = id)) +
  theme_void() +
  coord_map()

# Merge geospatial and numerical information: attach each state's post-2009
# change in mean grant amount (diff) to its hex polygon rows
spdf_fortified <- 
  left_join(
    spdf_fortified,
    pell_time_state_comparison %>% 
      select(state_name, mean_amt, timeframe) %>%
      pivot_wider(names_from = timeframe, values_from = mean_amt) %>%
      mutate(diff = post_2009 - pre_2009),
    by = c("id" = "state_name")
  )
 
# First choropleth draft: raw diff on a log-transformed continuous scale
ggplot() +
  geom_polygon(
    data = spdf_fortified,
    aes(x = long, y = lat, group = group, fill = diff)
  ) +
  scale_fill_gradient(trans = "log") +
  theme_void() +
  coord_map()

# Distribution of diff, used to pick sensible bin breaks for the map below
hist(spdf_fortified$diff)

# Prepare binning: bucket diff into $2M-wide bins, open-ended above $12M.
# Eight break points define seven intervals, one per label.
spdf_fortified$bin <- cut(
  spdf_fortified$diff,
  breaks = c(0, 2000000, 4000000, 6000000, 8000000, 10000000, 12000000, Inf),
  labels = c("0-$2 mil", "$2-$4 mil", "$4-$6 mil", "$6-$8 mil",
             "$8-$10 mil", "$10-$12 mil", "$12+ mil"),
  include.lowest = TRUE
)
 
# Prepare a color scale coming from the viridis color palette
library(viridis)
## Loading required package: viridisLite

## 
## Attaching package: 'viridis'

## The following object is masked from 'package:scales':
## 
##     viridis_pal
# One colour per bin: cut() above defines 7 levels, so 7 colours are needed.
# Fix: inferno(6) supplied only 6 colours, leaving the "$12+ mil" bin with no
# mapped colour in scale_fill_manual() if that bin ever appears in the data.
my_palette <-
  viridis::inferno(7)

# Final hex map: each state drawn as a hexagon filled by its binned post-2009
# change in mean grant amount, with the state code overlaid at the centroid.
# Fix: removed the trailing empty argument after plot.title in theme() and
# normalised spacing around `=`.
ggplot() +
  geom_polygon(data = spdf_fortified, aes(fill = bin, x = long, y = lat, group = group)) +
  geom_text(data = centers, aes(x = x, y = y, label = id), color = "white", size = 3, alpha = 0.6) +
  theme_void() +
  coord_map() + 
  scale_fill_manual( 
    values = my_palette, 
    name = "Change in Pell Grants $millions", 
    # Flat horizontal legend laid out across the top of the map
    guide = guide_legend(keyheight = unit(3, units = "mm"),
                         keywidth = unit(12, units = "mm"),
                         label.position = "bottom",
                         title.position = 'top', nrow = 1) 
  ) +
  ggtitle("Change in Mean Pell Grants by State\nPre/Post 2009") +
  theme(
    legend.position = c(0.5, 0.9),
    text = element_text(color = "#22211d"),
    plot.background = element_rect(fill = "#f5f5f2", color = NA), 
    panel.background = element_rect(fill = "#f5f5f2", color = NA), 
    legend.background = element_rect(fill = "#f5f5f2", color = NA),
    plot.title = element_text(size = 22, hjust = 0.5, color = "#4e4d47", margin = margin(b = -0.1, t = 0.4, l = 2, unit = "cm"))
  )

Posted on:
September 10, 2022
Length:
1350 minute read, 287499 words
Categories:
tidy tuesday data viz hex bin map
Tags:
tidy tuesday data viz hex bin map
See Also:
Getting started with topic modeling with dog breed traits
Combining 'Traditional' and Text-Based Models to Board Game Ratings
Comparing Traditional and Text-Based Models to Predict Chocolate Rating